###VER ----

#This file is part of the replication code for the paper:
#How Do Individuals in a Radical Echo Chamber React to Opposing Views? Evidence from a Content Analysis of Stormfront
#J Bright, N Marchal, B Ganesh, S Rudinac
#Human Communication Research 48 (1), 116-145. 2022. 
#Contact jonathan.bright@oii.ox.ac.uk with any questions

#The code should run out of the box to recreate the statistics reported in the paper. 
#You will need to adjust the working directory below based on your own file structure. 

#The code has a reasonable amount of comments in it, contact me with any questions. 
#If you are looking for a particular table you can also use ctrl-f to find the code (though you will need to execute most of the data prep section)

#Note as well the file HCR - Replication Code - Diagnostics.R which contains most of the diagnostics reported in the appendix (though some of them are actually in here as well).

#This file is also accompanied by two anonymised datasets

###WD ----
setwd("C:/Users/Simon/Desktop/Stormfront/Echo chambers exist/Data and Analysis")

###LIB ----

library(irr)
library(foreign)
library(readxl)
library(dplyr)
library(tibble)
library(tidyr)
library(ggplot2)
library(forcats)
library(lme4)
library(stargazer)
library(MuMIn)
library(pbkrtest)
library(survival)
library(survAUC)
library(survminer)
library(timereg)
library(markovchain)
library(performance)
library(dplyr)


###FUNC ----
#set of functions for getting key results from a model

#function to grab critical results from an coxph object
output_coxph <- function(model.results) {
  
  results <- broom::tidy(model.results)

    results$stars <- case_when(
    results$p.value < 0.001 ~ '***',
    results$p.value < 0.01 ~ '**',
    results$p.value < 0.05 ~ '*',
    results$p.value < 0.1 ~ '+',
    TRUE ~ ''
  )
  
  #exp results and combine with stars
  results$exp.coef <- exp(results$estimate)
  results$est <- paste(as.character(round(results$exp.coef, 2)), results$stars)
  
  #add observations and R2
  obs <- data.frame('term' = 'observations', 'est' = model.results$n)
  r2 <- data.frame('term' = 'R2', 'est' = round(summary(model.results)$rsq[1], 3))
  
  
  results <- rbind.data.frame(results[c('term', 'est')], r2, obs)
  return(results)
}

#function to grab critical results from an LMER object
#applies kenward - roger approximation for p values 
output_lmer <- function(model.results) {
  #i am leaving these here for the next time I copy/paste this func
  require(MuMIn)
  require(pbkrtest)
  
  #get kr pvals
  #good explanation here: http://mindingthebrain.blogspot.com/2014/02/three-ways-to-get-parameter-specific-p.html
  #NB results are exponentiated 
  df.KR <- get_ddf_Lb(model.results, fixef(model.results))
  results <- data.frame(coef(summary(model.results)))
  results$p.KR <- 2 * (1 - pt(abs(results$t.value), df.KR))
  results$stars <- case_when(
    results$p.KR < 0.001 ~ '***',
    results$p.KR < 0.01 ~ '**',
    results$p.KR < 0.05 ~ '*',
    results$p.KR < 0.1 ~ '+',
    TRUE ~ ''
  )
  
  #exp results and combine with stars
  results$exp.coef <- exp(results$Estimate)
  results$est <- paste(as.character(round(results$exp.coef, 2)), results$stars)
  results$p.KR <- round(results$p.KR, 3)
  #trim rownames
  results <- tibble::rownames_to_column(results, "coef")
  
  #add observations and R2
  r2 <- r.squaredGLMM(model.results)
  obs <- data.frame('coef' = 'observations', 'est' = nobs(model.results), 'p.KR' = '')
  marg <- data.frame('coef' = 'marginal R2', 
                     'est' = round(r2[1], 2), 'p.KR' = '')
  cond <- data.frame('coef' = 'conditional R2', 
                     'est' = round(r2[2], 2), 'p.KR' = '')
  
  results <- rbind.data.frame(results[c('coef', 'est', 'p.KR')], marg, cond, obs)
  return(results)
}


output_glmer <- function(model.results) {
  require(MuMIn)
  
  results <- data.frame(coef(summary(model.results)))
  
  results$stars <- case_when(
    results$Pr...z.. < 0.001 ~ '***',
    results$Pr...z.. < 0.01 ~ '**',
    results$Pr...z.. < 0.05 ~ '*',
    results$Pr...z.. < 0.1 ~ '+',
    TRUE ~ ''
  )
  
  #exp results and combine with stars
  results$exp.coef <- exp(results$Estimate)
  results$est <- paste(as.character(round(results$exp.coef, 2)), 
                       results$stars)
  #trim rownames
  results <- tibble::rownames_to_column(results, "coef")
  
  #add observations and R2
  #note that r2glmm reports something slightly different for GLMER 
  obs <- data.frame('coef' = 'observations', 'est' = nobs(model.results))
  marg <- data.frame('coef' = 'marginal R2', 
                     'est' = round(r.squaredGLMM(model.results)[1], 3))
  cond <- data.frame('coef' = 'conditional R2', 
                     'est' = round(r.squaredGLMM(model.results)[3], 3))
  
  results <- rbind.data.frame(results[c('coef', 'est')], marg, cond, obs)
  return(results)
}

####PREP & DESC ----
#Data preparation and production of descriptive statistics

#this dataset contains individual posts to the forum under study
#each row represents one post
posts <- read.csv('posts-anon.csv', stringsAsFactors = F)

#*Prep ----
#we originally had the idea of having a more complex coding schema
#this is collapsed here into the categories used in the paper
#note 'strong' became the internal way of referring to 'opposing'
#views
posts$cat[posts$cat=='misc'] <- 'unknown'
posts$cat[posts$cat=='weak'] <- 'strong'
posts$cat[posts$forum!='Opposing Views Forum'] <- 'outside'

#determine whether the post was inside or outside the opposing views
#forum
posts$cat0 <- posts$cat
posts$cat0[posts$cat0!='outside'] <- 'inside'
posts$cat0 <- fct_relevel(posts$cat0, c('outside', 'inside'))
posts$cat <- fct_relevel(posts$cat, c('outside', 'consonant', 'strong', 'unknown'))
posts <- droplevels(posts)

#WHY THIS
posts$poster_id <- as.character(posts$poster_id)

#Make a subset of data only within the opposing views forum
posts_ov <- posts[posts$forum=='Opposing Views Forum',]

#*Desc ----
#Descriptive statistics reporeted in Table A3
#And in a variety of places around the paper

table(posts$forum)#posts per forum
table(posts$cat)

desc <- posts %>%
  group_by(cat0) %>%
  summarise(
    n = n(),
    num_comments = sum(total_resps)
)

desc <- posts %>%
  group_by(cat) %>%
  summarise(
    n = n(),
    num_comments = sum(total_resps)
  )

desc <- posts %>%
  group_by(forum) %>%
  summarise(
    n = n(),
    num_comments = sum(total_resps)
  )

sum(posts$total_resps)

summary(posts[posts$total_resps>0,]$avg_usr_post_vols_before)

summary(posts$content_length)

#*Users ----

#further descriptives on users following R&R requests
#again these are found in table A3 and throughout

#this dataset contains history of user posting histories
#each row is one incident of a post to the forum made by one user
usr_gap <- read.csv('users-anon.csv', stringsAsFactors = F)
usrs <- usr_gap %>%
  group_by(user_id) %>%
  summarise(
    num_posts = n(),
    only.OV = sum(has.non.OV == 'True') == 0, 
    made_dissonant_post = sum(made_dissonant_post == 'True') > 0 
  )

#the users with [] in their user ID is the 'guest' users
#i.e. it is not one person but all guests to the forum 
usrs_anon <- usrs[usrs$user_id=='[]',]

5073 / (1468 + 34369)

usrs <- usrs %>% filter(user_id!='[]')


table(usrs$only.OV)
prop.table(table(usrs$only.OV))
table(usrs$made_dissonant_post)
prop.table(table(usrs$made_dissonant_post))

table(usrs$made_dissonant_post, usrs$only.OV)
prop.table(table(usrs$made_dissonant_post, usrs$only.OV))


####ANALYSIS ----
#*H1 Avoidance ---- 
#These are the models 1.0 - 1.2 contained in Table 1
#The outOV models were also a slight diagnostic check

m1.0 <- lmer(log(total_resps+1) ~ cat0 + log(content_length+1) + (1|month) + (1|poster_id), data=posts)
m1.0outov <- lmer(log(total_resps_outside_ov+1) ~ cat0 + log(content_length+1) + (1|month) + (1|poster_id), data=posts)
m1.1 <- lmer(log(total_resps+1) ~ cat + log(content_length+1) + (1|month) + (1|poster_id), data=posts)
m1.1outov <- lmer(log(total_resps_outside_ov+1) ~ cat + log(content_length+1) + (1|month) + (1|poster_id), data=posts)
m1.2 <- lmer(log(total_resps+1) ~ cat + log(content_length+1) + (1|month) + (1|poster_id), data=posts_ov)
m1.2outov <- lmer(log(total_resps_outside_ov+1) ~ cat + log(content_length+1) + (1|month) + (1|poster_id), data=posts_ov)

#These are models M2.0 - 2.2 contained in table 1
#response based models only make sense for posts with at least one respondent
#hence we do some subsets here 
posts_resps <- posts[posts$total_resps_outside_ov>0,]
posts_resps_ov <- posts_ov[posts_ov$total_resps_outside_ov>0,]

m2.0 <- lmer(log(avg_usr_post_vols_before+1) ~ cat0 + log(content_length+1) + (1|month) + (1|poster_id), data=posts_resps)
m2.0outov <- lmer(log(avg_usr_post_vols_before_outside_ov+1) ~ cat0 + log(content_length+1) + (1|month) + (1|poster_id), data=posts_resps)
m2.1 <- lmer(log(avg_usr_post_vols_before+1) ~ cat + log(content_length+1) + (1|month) + (1|poster_id), data=posts_resps)
m2.1outov <- lmer(log(avg_usr_post_vols_before_outside_ov+1) ~ cat + log(content_length+1) + (1|month) + (1|poster_id), data=posts_resps)
m2.2 <- lmer(log(avg_usr_post_vols_before+1) ~ cat + log(content_length+1) + (1|month) + (1|poster_id), data=posts_resps_ov)
m2.2outov <- lmer(log(avg_usr_post_vols_before_outside_ov+1) ~ cat + log(content_length+1) + (1|month) + (1|poster_id), data=posts_resps_ov)


#Output to create the data for table 1

fname <- 'Hypothesis 1 - Responses - Posts Level.csv'
write('', fname)#create empty file
write('\nModel 1.0\n', fname, append=T)
write.table(output_lmer(m1.0), fname, sep=',', row.names=F, append=T)
write('\nModel 1.0outov\n', fname, append=T)
write.table(output_lmer(m1.0outov), fname, sep=',', row.names=F, append=T)
write('\nModel 1.1\n', fname, append=T)
write.table(output_lmer(m1.1), fname, sep=',', row.names=F, append=T)
write('\nModel 1.1outov\n', fname, append=T)
write.table(output_lmer(m1.1outov), fname, sep=',', row.names=F, append=T)
write('\nModel 1.2\n', fname, append=T)
write.table(output_lmer(m1.2), fname, sep=',', row.names=F, append=T)
write('\nModel 1.2outov\n', fname, append=T)
write.table(output_lmer(m1.2outov), fname, sep=',', row.names=F, append=T)
write('\nModel 2.0\n', fname, append=T)
write.table(output_lmer(m2.0), fname, sep=',', row.names=F, append=T)
write('\nModel 2.0outov\n', fname, append=T)
write.table(output_lmer(m2.0outov), fname, sep=',', row.names=F, append=T)
write('\nModel 2.1\n', fname, append=T)
write.table(output_lmer(m2.1), fname, sep=',', row.names=F, append=T)
write('\nModel 2.1outov\n', fname, append=T)
write.table(output_lmer(m2.1outov), fname, sep=',', row.names=F, append=T)
write('\nModel 2.2\n', fname, append=T)
write.table(output_lmer(m2.2), fname, sep=',', row.names=F, append=T)
write('\nModel 2.2outov\n', fname, append=T)
write.table(output_lmer(m2.2outov), fname, sep=',', row.names=F, append=T)

#*ICC Addition ----
#Added in the calculation of the conditional ICC at request of one reviewer. 
#Also reported in Table 1
icc(m1.0)
icc(m1.1)
icc(m1.2)
icc(m2.0)
icc(m2.1)
icc(m2.2)

#*H2 Abandonment ----
#This is the gap time model reported in Table 2

#**Gap time data prep ----
#This is basically the same prep as above, but for the gap time data

usr_gap <- read.csv('users-anon.csv', stringsAsFactors = F)

#remove the unknown user - not relevant for this analysis
usr_gap <- usr_gap[usr_gap$user_id!='[]',]

table(usr_gap$cat)
table(usr_gap$has.non.OV)
table(usr_gap$made_dissonant_post)

#recoding
usr_gap$cat[usr_gap$cat=='misc'] <- 'unknown'
usr_gap$cat[usr_gap$cat=='weak'] <- 'strong'

#any non OV forum posts 
usr_gap$cat[!usr_gap$cat %in% c('strong', 'consonant', 'unknown')] <- 'outside'

usr_gap$cat0 <- usr_gap$cat
usr_gap$cat0[usr_gap$cat0!='outside'] <- 'inside'

usr_gap$cat0 <- fct_relevel(usr_gap$cat0, c('outside', 'inside'))
usr_gap$cat <- fct_relevel(usr_gap$cat, c('outside', 'consonant', 'strong', 'unknown'))


usr_gap <- droplevels(usr_gap)

#the first post is removed as it doesn't have a meaningful 'gap' (i.e. gap from post 1 -> 2 is the first considered)
usr_gap <- usr_gap[usr_gap$num_prev!=0,]

#arbitrary 30 second increment to gap times is included here
#this is added as the post data is measured at the minute level
#and some replies are less than a minute apart
#however the model will not accept a gap of zero as valid
usr_gap$gap <- usr_gap$gap + 30

#a vector of zeroes. these are the start times for each gap
usr_gap$zero <- 0

#subset the data
usr_gap_nonov <- usr_gap[usr_gap$has.non.OV=='True'&
                      usr_gap$made_dissonant_post=='False',]

#*num users, seniority ----
nrow(data.frame(table(usr_gap$user_id)))

#look at num obs per strata to decide truncation
stratacounts <- data.frame(table(usr_gap_nonov$num_prev))
#strata 169 is the limit of at least 50 observations per strata
usr_gap_trunc <- usr_gap_nonov[usr_gap_nonov$num_prev<=169,]
#this is the truncated model described in appendix A2.2

#not.censored - 0: censored, 1: not censored


#**Model----
#Models reported in Table 2
#And also some diagnostic models reported in A2.2

m3.0=coxph(Surv(zero,gap,not.censored) ~ cat0 + log10(respond_chain+1) + cluster(user_id) + strata(num_prev), method="breslow",  data =usr_gap)

m3.0.trunc.diag =coxph(Surv(zero,gap,not.censored) ~ cat0  + log10(respond_chain+1) + cluster(user_id) + strata(num_prev), method="breslow",  data =usr_gap_trunc)

m3.0.nonov=coxph(Surv(zero,gap,not.censored) ~ cat0 + log10(respond_chain+1) + cluster(user_id) + strata(num_prev), method="breslow",  data = usr_gap_nonov)

m3.1=coxph(Surv(zero,gap,not.censored) ~ cat + log10(respond_chain+1) + cluster(user_id) + strata(num_prev), method="breslow",  data =usr_gap)

m3.1.trunc.diag =coxph(Surv(zero,gap,not.censored) ~ cat  + log10(respond_chain+1) + cluster(user_id) + strata(num_prev), method="breslow",  data =usr_gap_trunc)

m3.1.nonov=coxph(Surv(zero,gap,not.censored) ~ cat + log10(respond_chain+1) + cluster(user_id) + strata(num_prev), method="breslow",  data = usr_gap_nonov)


#output
fname <- 'Hypothesis 2 - Gap Time Analysis - Posts Level.csv'
write('', fname)#create empty file
write('\nGap Time Model - Post Views Level\n', fname, append=T)
write.table(output_coxph(m3.0), fname, sep=',', row.names=F, append=T)
write('\nGap Time Model - Post Views Level - Truncated diag\n', fname, append=T)
write.table(output_coxph(m3.0.trunc.diag), fname, sep=',', row.names=F, append=T)
write('\nGap Time Model - Post Views Level - NON OV diag\n', fname, append=T)
write.table(output_coxph(m3.0.nonov), fname, sep=',', row.names=F, append=T)
write('\nGap Time Model - Post Views Level\n', fname, append=T)
write.table(output_coxph(m3.1), fname, sep=',', row.names=F, append=T)
write('\nGap Time Model - Post Views Level - Truncated diag\n', fname, append=T)
write.table(output_coxph(m3.1.trunc.diag), fname, sep=',', row.names=F, append=T)
write('\nGap Time Model - Post Views Level - NON OV diag\n', fname, append=T)
write.table(output_coxph(m3.1.nonov), fname, sep=',', row.names=F, append=T)


#*H3 Consonance ----
#This is the model reported in Table 3 and also Appendix A2.3

#**Markov Chain ----

#Data
#this is the same as the gap time data *but* we don't get rid of the first entry
#so we are doing the data preparation again here

usr_mc <- read.csv('users-anon.csv', stringsAsFactors = F)

#remove the unknown user and people who are dissonant
usr_mc <- usr_mc[usr_mc$user_id!='[]',]

usr_mc <- usr_mc[usr_mc$has.non.OV=='True'&
                         usr_mc$made_dissonant_post=='False',]

#recoding
usr_mc$cat[usr_mc$cat=='misc'] <- 'unknown'
usr_mc$cat[usr_mc$cat=='weak'] <- 'strong'
usr_mc$cat[usr_mc$cat=='strong'] <- 'dissonant'
#any non OV forum posts 
usr_mc$cat[!usr_mc$cat %in% c('dissonant', 'consonant', 'unknown')] <- 'outside'


#we can simply pass the category column to create markov chains
#first obs for each user is an 'NA' which is considered a break/ reset to new chain by the packages 
mcf <- markovchainFit(data = usr_mc$cat)


#graphical representation shown in Fig A2
mcf_plot <- mcf$estimate
mcf_plot@transitionMatrix <- round(mcf_plot@transitionMatrix, 2)
plot(mcf_plot, package='DiagrammeR')


mcf$confidenceLevel
#can use markovchainFit confidencelevel = 0.99 for other probs
mcf <- markovchainFit(data = usr_mc$cat, confidencelevel = 0.95)
#mcf <- markovchainFit(data = usr_mc$cat, confidencelevel = 0.999)
#shows diff is signif at 0.01 but not 0.001

#summarise results as data frame 
dfe <- data.frame(mcf$estimate@transitionMatrix) %>% rownames_to_column(var = 'from') %>%
  gather(to, estimate, -from)

dfll <- data.frame(mcf$lowerEndpointMatrix) %>% rownames_to_column(var = 'from') %>%
  gather(to, ll, -from)

dful <- data.frame(mcf$upperEndpointMatrix) %>% rownames_to_column(var = 'from') %>%
  gather(to, ul, -from)

df <- dfe %>% 
  left_join(dfll, by = c('from', 'to')) %>% 
  left_join(dful, by = c('from', 'to')) %>% 
  mutate_if(is.numeric, round, 4)

write.csv(df, 'dtmc-transition-probs.csv')

#further diagnostic results

mcfnonov <- markovchainFit(data = usr_mc$cat[usr_mc$has.non.OV=='True'])

mcfnonov$confidenceLevel

dfe <- data.frame(mcfnonov$estimate@transitionMatrix) %>% rownames_to_column(var = 'from') %>%
  gather(to, estimate, -from)

dfll <- data.frame(mcfnonov$lowerEndpointMatrix) %>% rownames_to_column(var = 'from') %>%
  gather(to, ll, -from)

dful <- data.frame(mcfnonov$upperEndpointMatrix) %>% rownames_to_column(var = 'from') %>%
  gather(to, ul, -from)

df <- dfe %>% 
  left_join(dfll, by = c('from', 'to')) %>% 
  left_join(dful, by = c('from', 'to'))
